from PIL import Image # read images
import requests
from io import BytesIO
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup # web scraping
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import nltk
import math
import time
import re
import os
import seaborn as sns
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import pairwise_distances
from matplotlib import gridspec
from scipy.sparse import hstack
import plotly
import plotly.figure_factory as ff
from plotly.graph_objs import Scatter, Layout
import itertools
plotly.offline.init_notebook_mode(connected=True)  # render plotly charts inline in the notebook
import warnings
warnings.filterwarnings("ignore")  # silence library deprecation noise in the notebook output
# Load the raw apparel catalogue (one JSON record per product).
data = pd.read_json('tops_fashion.json')
data.shape
data.columns
# I will use only 7 features:
# 1. asin (Amazon Standard Identification Number — unique product id)
# 2. brand (brand to which the product belongs)
# 3. color (color information of the apparel; may hold several colors, e.g. "red and black stripes")
# 4. product_type_name (type of the apparel, e.g. SHIRT/TSHIRT)
# 5. medium_image_url (URL of the product image)
# 6. title (title of the product — the main textual description)
# 7. formatted_price (price of the product)
# Keep only the 7 features used by the recommenders below.
data = data[['asin', 'brand', 'color', 'medium_image_url', 'product_type_name', 'title', 'formatted_price']]
data.head()
# Count of distinct product types:
data["product_type_name"].nunique()
# Missing values check:
data["product_type_name"].isnull().sum()
# Top 10 most frequent product types (pass normalize=True for percentages):
data["product_type_name"].value_counts()[:10] # to see percentage:normalize=True
# Count of distinct brands:
data["brand"].nunique()
# Missing values check:
data["brand"].isnull().sum()
# Top 10 most frequent brands:
data["brand"].value_counts()[:10]
# Count of distinct colors:
data["color"].nunique()
# Missing values check:
data["color"].isnull().sum()
# Top 10 most frequent colors:
data["color"].value_counts()[:10]
# Missing values check for price:
data["formatted_price"].isnull().sum()
# Percentage of rows where price is NOT missing:
((len(data["formatted_price"]) - data["formatted_price"].isnull().sum()) / len(data["formatted_price"])) * 100
data["formatted_price"].describe()
# Title summary — present for (almost) every row; the main text feature.
data["title"].describe()
# Work on a copy so the raw dataframe stays intact.
data2 = data.copy()
# Drop rows with a missing price.  The boolean mask must be built from data2
# itself: after the first filter, a mask built from `data` carries the full
# (unfiltered) index and modern pandas raises "unalignable boolean Series".
data2 = data2.loc[~data2["formatted_price"].isnull()]
data2.shape
# Drop rows with a missing color (again: mask on data2, not data).
data2 = data2.loc[~data2["color"].isnull()]
data2.shape
# Count of duplicated titles:
data2.duplicated("title").sum()
# 1- Ignore rows whose title has 4 words or fewer.
# How many such short titles:
data2['title'].apply(lambda x: len(x.split()) <= 4).sum()
# Keep only titles longer than 4 words.  The explicit .copy() makes data3 an
# independent frame, so the in-place sort below does not trigger pandas'
# SettingWithCopyWarning on a view.
data3 = data2[data2['title'].apply(lambda x: len(x.split()) > 4)].copy()
data3.shape
# Sort by title so near-duplicate titles end up adjacent for the pass below.
data3.sort_values('title', inplace=True, ascending=False)
data3.head()
# Near-duplicate removal, pass 1.  Titles are sorted, so duplicates are
# adjacent: compare each title with the next one and keep a row only when
# the two titles differ in more than 2 word positions.
keep_asins = []
for i in range(len(data3) - 1):
    a = data3['title'].iloc[i].split()
    b = data3['title'].iloc[i + 1].split()
    # longest of the two word lists
    length = max(len(a), len(b))
    # number of positions where both titles carry the same word
    count = 0
    # zip_longest pads the shorter list with None, e.g.
    # (['a','b','c','d'], ['a','b','d']) -> [('a','a'), ('b','b'), ('c','d'), ('d', None)]
    for k in itertools.zip_longest(a, b):
        if k[0] == k[1]:
            count += 1
    if (length - count) > 2:  # titles differ in more than 2 word positions
        keep_asins.append(data3['asin'].iloc[i])
# BUG FIX: the loop only ever keeps row i after comparing it with i+1, so the
# last row could never be kept even when distinct.  Keep it explicitly; pass 2
# below removes it again if it really is a duplicate.
if len(data3):
    keep_asins.append(data3['asin'].iloc[-1])
# BUG FIX: the mask must be built from data3 (not data) so the boolean index
# aligns with the frame being filtered.
data4 = data3.loc[data3['asin'].isin(keep_asins)]
data4.shape
data5 = data4.copy()
# Near-duplicate removal, pass 2: O(n^2) all-pairs comparison.  For every
# remaining title, drop any other pending title that differs from it in
# fewer than 3 word positions.
indices = list(data5.index)  # index labels still pending a decision
keep_asins = []
while len(indices) != 0:
    i = indices.pop()
    keep_asins.append(data5['asin'].loc[i])
    # word list of the title we are keeping,
    # e.g. ['tokidoki', 'The', 'Queen', 'of', 'Diamonds', "Women's", 'Shirt', 'X-Large']
    a = data5['title'].loc[i].split()
    # BUG FIX: iterate a snapshot of `indices` — the original iterated the
    # live list while calling indices.remove(j), which silently skips the
    # element after every removal.
    for j in list(indices):
        b = data5['title'].loc[j].split()
        length = max(len(a), len(b))
        # positions where both titles carry the same word (zip_longest pads
        # the shorter list with None, so extra words never count as matches)
        count = 0
        for k in itertools.zip_longest(a, b):
            if k[0] == k[1]:
                count += 1
        # fewer than 3 differing positions -> same apparel: drop j
        if (length - count) < 3:
            indices.remove(j)
# BUG FIX: mask built from data5 (not data) so the boolean index aligns.
data6 = data5.loc[data5['asin'].isin(keep_asins)]
data6.shape
# Drop special chars like '"#$@!%^&*()_+-~?>< from the titles.  Raw string
# avoids the invalid-escape warning, and regex=True makes the (formerly
# implicit, now deprecated) pattern semantics explicit.
data6["title"] = data6["title"].str.replace(r"[^\w\s]", "", regex=True)
data6.head()
# Lower-case every word:
data6["title"] = data6["title"].apply(lambda x: " ".join(i.lower() for i in x.split()))
data6.head()
# Remove English stop-words:
sw = stopwords.words("english")
data6["title"] = data6["title"].apply(lambda x: " ".join(i for i in x.split() if i not in sw))
data6.head()
# Persist the cleaned frame for later reuse.
data6.to_pickle("data6")
# Stemming was tried and left disabled:
# st = PorterStemmer()
# data6["title"] = data6["title"].apply(lambda x: " ".join(st.stem(word) for word in x.split()))
# Bag-of-words representation of the cleaned titles.
vectorizer = CountVectorizer()
vectorizer_title = vectorizer.fit_transform(data6["title"])
vectorizer_title.get_shape()
def count_model(ID, num_results):
    """Print the `num_results` products whose bag-of-words title vectors lie
    closest (euclidean) to that of matrix row `ID`, with their images.

    Uses the module-level `vectorizer_title` matrix and `data6` frame.
    """
    # Euclidean distance from every title vector to the query title vector.
    dists = pairwise_distances(vectorizer_title, vectorizer_title[ID]).flatten()
    # Matrix positions of the nearest titles (the query itself comes first).
    for pos in np.argsort(dists)[:num_results]:
        # Map the matrix position back to the data6 index label.
        label = data6.index[pos]
        print("Title: ", data6['title'].loc[label])
        print("Distance", dists[pos])
        # Fetch and display the product image.
        response = requests.get(data6['medium_image_url'].loc[label])
        plt.imshow(Image.open(BytesIO(response.content)))
        plt.show()
# indices: array([ 1354, 1353, 15143, 8171, 7520], dtype=int64)
count_model(15143,5)
count_model(2983,10)
count_model(12920,5)
# TF-IDF representation of the cleaned titles:
tf_idf_word_vectorizer = TfidfVectorizer()
tf_idf_title = tf_idf_word_vectorizer.fit_transform(data6["title"])
def tf_idf_model(ID, num_results):
    """Print the `num_results` products whose TF-IDF title vectors lie
    closest (euclidean) to that of matrix row `ID`, with their images.

    Uses the module-level `tf_idf_title` matrix and `data6` frame.
    """
    # Euclidean distance from every TF-IDF title vector to the query's.
    dists = pairwise_distances(tf_idf_title, tf_idf_title[ID]).flatten()
    # Walk the nearest matrix positions (query itself comes first).
    for pos in np.argsort(dists)[:num_results]:
        # Map the matrix position back to the data6 index label.
        label = data6.index[pos]
        print("Title: ", data6['title'].loc[label])
        print("Distance", dists[pos])
        # Fetch and display the product image.
        response = requests.get(data6['medium_image_url'].loc[label])
        plt.imshow(Image.open(BytesIO(response.content)))
        plt.show()
# Sample queries against the TF-IDF model.
tf_idf_model(15143,5)
tf_idf_model(2983,10)
tf_idf_model(12920,5)
tf_idf_model(4946,5)
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
# Pre-trained 300-dimensional GoogleNews embeddings (large file, slow load).
model = KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin",binary=True)
# Vocabulary of the pre-trained model (count of words below).
# NOTE(review): `.wv.vocab` is the gensim 3.x API; gensim 4+ removed `vocab`
# (use `model.key_to_index`) — confirm the installed gensim version.
vocab = model.wv.vocab
len(vocab)
# Build an averaged word2vec representation per title.
def w2v_vectorize(x):
    """Return the average word2vec embedding (300-d numpy array) of the
    whitespace-separated words in title string `x`.

    Out-of-vocabulary words contribute a zero vector but are still counted
    in the divisor, i.e. the average is over ALL tokens of the title.
    NOTE(review): averaging over only in-vocabulary tokens may be intended —
    confirm before changing.  (Removed an unused `sent_vectors` local.)
    """
    n_words = 0
    vector = np.zeros((300,))
    for word in x.split():
        n_words += 1
        if word in vocab:
            vector += model[word]
    # Guard against an empty title (avoid division by zero).
    if n_words != 0:
        vector = vector / n_words
    return np.array(vector)
# Average-word2vec representation for every cleaned title, in data6 order.
avg_word2vector = [w2v_vectorize(title) for title in data6['title']]
def avg_w2v_model(ID, num_results):
    """Print the `num_results` products whose average-word2vec title vectors
    lie closest (euclidean) to that of row `ID`, with their images.

    Uses the module-level `avg_word2vector` list and `data6` frame.
    """
    query = avg_word2vector[ID].reshape(1, -1)
    # Euclidean distance from every averaged title vector to the query's.
    dists = pairwise_distances(avg_word2vector, query).flatten()
    for pos in np.argsort(dists)[:num_results]:
        # Map the list position back to the data6 index label.
        label = data6.index[pos]
        print("Title: ", data6['title'].loc[label])
        print("Distance", dists[pos])
        # Fetch and display the product image.
        response = requests.get(data6['medium_image_url'].loc[label])
        plt.imshow(Image.open(BytesIO(response.content)))
        plt.show()
# Sample queries against the average-word2vec model.
avg_w2v_model(1514,10)
avg_w2v_model(3043,10)
avg_w2v_model(12020,5)
avg_w2v_model(4946,10)
# Build a TF-IDF weighted word2vec representation per title.
def tfidf_w2v_vectorize(x, ID):
    """Return the TF-IDF-weighted word2vec embedding (300-d numpy array)
    of title string `x`.

    Each word that is both in the word2vec vocabulary and the TF-IDF
    vocabulary contributes its embedding scaled by its TF-IDF weight taken
    from row `ID` of `tf_idf_title`; the sum is divided by the total token
    count of the title.  (Removed an unused `sent_vectors` local.)
    """
    n_words = 0
    vector = np.zeros((300,))
    for word in x.split():
        n_words += 1
        if word in vocab and word in tf_idf_word_vectorizer.vocabulary_:
            vector += model[word] * tf_idf_title[ID, tf_idf_word_vectorizer.vocabulary_[word]]
    # Guard against an empty title (avoid division by zero).
    if n_words != 0:
        vector = vector / n_words
    return np.array(vector)
# TF-IDF weighted word2vec vector for every title; the running row number is
# the ID used to look up that title's TF-IDF weights.
tfidf_word2vector = [tfidf_w2v_vectorize(title, row)
                     for row, title in enumerate(data6['title'])]
def tfidf_w2v_model(ID, num_results):
    """Print the `num_results` products whose TF-IDF-weighted word2vec title
    vectors lie closest (euclidean) to that of row `ID`, with their images.

    Uses the module-level `tfidf_word2vector` list and `data6` frame.
    """
    query = tfidf_word2vector[ID].reshape(1, -1)
    # Euclidean distance from every weighted title vector to the query's.
    dists = pairwise_distances(tfidf_word2vector, query).flatten()
    for pos in np.argsort(dists)[:num_results]:
        # Map the list position back to the data6 index label.
        label = data6.index[pos]
        print("Title: ", data6['title'].loc[label])
        print("Distance", dists[pos])
        # Fetch and display the product image.
        response = requests.get(data6['medium_image_url'].loc[label])
        plt.imshow(Image.open(BytesIO(response.content)))
        plt.show()
# Sample queries against the TF-IDF weighted word2vec model.
tfidf_w2v_model(4946,10)
tfidf_w2v_model(12020,5)
tfidf_w2v_model(3043,10)
tfidf_w2v_model(6677,10)
# Fill missing brands with a placeholder category.
data6['brand'].fillna(value="Not given", inplace=True)
# Replace spaces with hyphens so multi-word values remain single tokens
# for the vectorizers below (e.g. "tommy hilfiger" -> "tommy-hilfiger").
brands = [x.replace(" ", "-") for x in data6['brand'].values]
colors = [str(x).replace(" ", "-") for x in data6['color'].values]
type_name = [str(x).replace(" ", "-") for x in data6['product_type_name'].values]
# One TF-IDF model per categorical feature.
tf_idf_brands_vectorizer = TfidfVectorizer()
tf_idf_brands = tf_idf_brands_vectorizer.fit_transform(brands)
tf_idf_colors_vectorizer = TfidfVectorizer()
tf_idf_colors = tf_idf_colors_vectorizer.fit_transform(colors)
tf_idf_type_name_vectorizer = TfidfVectorizer()
# BUG FIX: was fit on `colors`; the product-type vectorizer must be fit on
# `type_name`.
tf_idf_type_name = tf_idf_type_name_vectorizer.fit_transform(type_name)
# Stack the three sparse matrices side by side into one feature matrix.
color_brand = hstack((tf_idf_brands, tf_idf_colors, tf_idf_type_name)).tocsr()
def tf_idf_color_brand_model(ID, num_results, w1, w2):
    """Weighted-distance recommender: blends the TF-IDF-weighted word2vec
    title distance (weight `w1`) with the brand/color/type TF-IDF distance
    (weight `w2`) and prints the `num_results` closest products with images.
    """
    # Distance of every product to the query, for each signal separately.
    title_d = pairwise_distances(tfidf_word2vector, tfidf_word2vector[ID].reshape(1, -1))
    meta_d = pairwise_distances(color_brand, color_brand[ID])
    # Weighted average of the two distance columns.
    blended = ((w1 * title_d + w2 * meta_d) / float(w1 + w2)).flatten()
    for pos in np.argsort(blended)[:num_results]:
        # Map the matrix position back to the data6 index label.
        label = data6.index[pos]
        print("Title: ", data6['title'].loc[label])
        print("Distance", blended[pos])
        # Fetch and display the product image.
        response = requests.get(data6['medium_image_url'].loc[label])
        plt.imshow(Image.open(BytesIO(response.content)))
        plt.show()
# Sample queries varying the title vs. metadata weights.
tf_idf_color_brand_model(10877,10,1,100)
tf_idf_color_brand_model(10877,10,100,1)
tf_idf_color_brand_model(15143,10,1,1)
tf_idf_color_brand_model(15143,10,1,100)
tf_idf_color_brand_model(12920,10,1,100)
import tensorflow.keras
import pandas as pd
import sklearn as sk
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dropout, Flatten, Dense
from tensorflow.keras import applications
from sklearn.metrics import pairwise_distances
import matplotlib.pyplot as plt
import requests
from PIL import Image
import pandas as pd
import pickle
# VGG16 without the classifier head (include_top=False) acts as a pure
# convolutional feature extractor; each image's output flattens to 25088
# values (see the reshape in feature_exract below).
model = applications.VGG16(weights='imagenet', include_top=False)
model.summary()
#Function to compute VGG-16 CNN for image feature extraction
# dimensions of our images.
img_width, img_height = 224, 224
top_model_weights_path = 'bottleneck_fc_model.h5'  # not referenced below — TODO confirm it is needed
train_data_dir = '16k_images/'  # directory containing the product images
nb_train_samples = 16032  # number of images to featurize
epochs = 50  # not referenced below (no training happens) — TODO confirm
batch_size = 1
def feature_exract():
    """Run every image in `train_data_dir` through VGG16 and save the
    flattened features (`cnn_features.npy`) plus the matching ASIN labels
    (`cnn_feature_asins.npy`) in generator file order.

    (The misspelled name is kept as-is to avoid breaking callers.)
    """
    asins = []
    datagen = ImageDataGenerator(rescale=1. / 255)
    generator = datagen.flow_from_directory(
        train_data_dir,
        target_size=(img_width, img_height),
        batch_size=batch_size,
        class_mode=None,   # no labels — inference only
        shuffle=False)     # keep file order so features align with asins
    for i in generator.filenames: # take labels
        # Slices "xx/<asin>.jpeg" down to the bare ASIN by dropping a 2-char
        # directory prefix and a 5-char extension — assumes that exact
        # layout; TODO confirm against the actual image folder.
        asins.append(i[2:-5])
    # NOTE(review): predict_generator is deprecated in TF2 — model.predict
    # accepts generators directly; confirm the installed TF version.
    cnn_features = model.predict_generator(generator, nb_train_samples // batch_size)
    cnn_features = cnn_features.reshape((nb_train_samples,25088))
    np.save(open('cnn_features.npy', 'wb'), cnn_features)
    np.save(open('cnn_feature_asins.npy', 'wb'), np.array(asins))
# Load the pre-computed CNN features and their corresponding ASINs
# (rows are in the image-generator file order, not data6 order).
cnn_features = np.load('cnn_features.npy')
cnn_feature_asins = np.load('cnn_feature_asins.npy')
cnn_feature_asins = list(cnn_feature_asins)
# ASINs of the cleaned dataset, in data6 row order.
df_asins = list(data6['asin'])
def cnn(ID, num_results):
    """Recommend `num_results` visually similar products for data6 row `ID`
    using the pre-computed VGG16 features; prints titles and images.

    `cnn_features` rows follow the image-generator file order, so position
    `ID` (data6 order) is first mapped to its feature-matrix row via the
    saved ASIN list.
    """
    # BUG FIX: was `asins.index(...)` — `asins` is a local of
    # feature_exract and does not exist here (NameError); the saved
    # `cnn_feature_asins` list is the right lookup table.
    doc_id = cnn_feature_asins.index(df_asins[ID])
    # BUG FIX: the query row must be the mapped `doc_id`, not the raw `ID`
    # (the original computed doc_id and then never used it).
    pairwise_distance = pairwise_distances(cnn_features, cnn_features[doc_id].reshape(1, -1))
    # Feature-matrix positions of the nearest images.
    indices = np.argsort(pairwise_distance.flatten())[0:num_results]
    # Their distances, sorted (shown for information only).
    smallest_distance = np.sort(pairwise_distance.flatten())[0:num_results]
    for i in range(len(indices)):
        # Map the feature-matrix row back to the product via its ASIN.
        rows = data6[['medium_image_url', 'title']].loc[data6['asin'] == cnn_feature_asins[indices[i]]]
        for indx, row in rows.iterrows():
            print("Title: ", row['title'])
            print("Distance", smallest_distance[i])
            # take image's url
            url = row['medium_image_url']
            response = requests.get(url)
            img = Image.open(BytesIO(response.content))
            # show image
            plt.imshow(img)
            plt.show()
# Sample queries against the image-similarity model.
cnn(12530,10)
cnn(7766,10)
cnn(6677,10)
cnn(9933,10)
cnn(3399,10)
def combine_models(ID,num_results,w1,w2,w3):
    """Blend three distance signals — title TF-IDF, brand/color/type TF-IDF
    and VGG16 image features — into one weighted distance and print the
    `num_results` closest products with their images.
    """
    # w1: weight for tfidf
    # w2: weight for color&brand features
    # w3: cnn weight
    # NOTE(review): tf_idf_title and color_brand rows follow data6 order,
    # but cnn_features rows follow the image-generator file order (cnn()
    # maps through the ASIN list first).  Indexing all three with the same
    # ID assumes those orders coincide — confirm before relying on this.
    # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise_distances.html
    tf_idf_distance = pairwise_distances(tf_idf_title,tf_idf_title[ID])
    color_brand_distance = pairwise_distances(color_brand,color_brand[ID])
    cnn_distance = pairwise_distances(cnn_features,cnn_features[ID].reshape(1,-1))
    # Weighted average of the three distance columns (note: the signals are
    # on different scales, so weights also absorb scale differences).
    pairwise_distance = (w1 * tf_idf_distance + w2 * color_brand_distance + w3 * cnn_distance)/float(w1 + w2 + w3)
    # Positions of the num_results smallest blended distances.
    indices = np.argsort(pairwise_distance.flatten())[0:num_results]
    # The distances themselves, sorted (shown for information only).
    smallest_distance = np.sort(pairwise_distance.flatten())[0:num_results]
    # Map matrix positions back to data6 index labels.
    df_indices = list(data6.index[indices])
    for i in range(len(df_indices)):
        print("Title: ",data6['title'].loc[df_indices[i]])
        print("Distance",smallest_distance[i])
        # take image's url
        url = data6['medium_image_url'].loc[df_indices[i]]
        response = requests.get(url)
        img = Image.open(BytesIO(response.content))
        # show image
        plt.imshow(img)
        plt.show()